Introduction

This notebook documents the creation of a B-ALL classifier constructed from two datasets ( St.Judes and Lund). The classifier will be a multi-class classifier utilisting the Random Forest algorithm native as implemented initially in Fortran and then R. By utilising two different datasets we hope to mitigate batch specific effects and biases as well as improve statistical power and classification clout.\

Datasets

St.Judes

We have the log FPKM dataset for 289 patients at St.Judes (~150 with Phlike or Ph+) in which to use to train and test our classifier. The classifier aims to seperate patients into one of four categories: Phlike, ERG, ETV and Other (a miscellaneous category).

Lund

We have the RAW RNA-seq data for 195 patients taken at Lund University. The samples are for patients with paediatric B-Cell precursor acute lumphoblastic leukaemia (BCP ALL). Of these samples 127 (65%) had in-fram fusion genes observed. The Lund samples provide an additional group to the St.Judes which we call DUX4 (a recurrent IGH-DUX4 or ERG-DUX4 fusion).

Collating and Munging

Firstly, we need to read in our data, do some munging and sculpt it into a useable form.

library(edgeR)
library(limma)
library(gplots)
library(statmod)
library(RColorBrewer)
library(randomForest)
library(biomaRt)
library(dplyr)
library(RUVSeq)

normalisedcounts= read.csv(file="../../MCRI/BALL/ALL_FPKM_filtered_genes_log2signal_annot.txt",sep="\t",stringsAsFactors=FALSE,row.names=1)

#Just take the sample columns, leave out the other annotational columns
ncounts = normalisedcounts[,1:289]

#Normalise counts
#ncounts2 = t(apply(ncounts,1, function(x) (x - min(x))/(max(x) - min(x))))
#normalisedcounts[,1:289] = ncounts2


#Figure out the classes
#Make the groupings from Phlike and non phlike
pheno = strsplit2(colnames(ncounts),".Aligned",fixed=TRUE)[,1]
#Change underscore string to dots
pheno = gsub("___________",".",pheno)
pheno = gsub("__________",".",pheno)
pheno = gsub("_________",".",pheno)
pheno = gsub("________",".",pheno)
pheno = gsub("_______",".",pheno)

tmp = strsplit2(pheno,".",fixed=TRUE) #Split the string by .
#Make chained ifelse statement
targets = data.frame(Fusion = ifelse(grepl("Phlike", tmp[,1]) | grepl("PHALL", tmp[,1]), "Phlike", 
                                   ifelse(grepl("ERG", tmp[,1]), "ERG",
                                          ifelse(grepl("ETV", tmp[,1]),"ETV","Other"))),
                   Sex = ifelse(t(ncounts["XIST",] > 3) , "F", "M") #t() transpose into a vector of rows, rather than columns
)

targets$Sample = pheno
#Rename samples
colnames(normalisedcounts)[1:289] = pheno

targets$Original_Class = targets$Fusion
targets$Original_Class = ifelse(targets$Fusion == "Other",tmp[,1],as.character(targets$Fusion))
#head(targets)
ntargets=targets

Gather the Lund data

#Now read in Lund data 
lcounts= read.table(file="../../MCRI/BALL/EGAD00001002112/counts.txt",sep="\t",stringsAsFactors=FALSE,row.names=1,header=TRUE)


colnames(lcounts) = gsub('.*_EGAR.*_C','C',colnames(lcounts))
colnames(lcounts) = gsub('.Aligned.out.bam','',colnames(lcounts))
colnames(lcounts) = gsub('_0*','_',colnames(lcounts))

llength = data.frame(Length = lcounts[,'Length'])

#Make DGE list
y.keep <- DGEList(counts=lcounts[,6:200])


#Remove XIST to avoid gender differences
#remove <- row.names(y.keep) == "XIST"
#y.keep = y.keep[!remove,]

y.keep = calcNormFactors(y.keep) #Normalise the raw counts

#Extract gene lengths
FPKM <- rpkm(y.keep,normalized.lib.sizes=TRUE,log=TRUE,prior.count=0.25,gene.length = lcounts$Length)
#FPKM <- rpkm(y.keep,log=TRUE,gene.length = lcounts$Length)
FPKM <- data.frame(FPKM)
FPKM$Gene_Symbol = row.names(FPKM)
#head(FPKM)


#Lund sample data
lsamp= read.csv(file="../../MCRI/BALL/Supplementary_Lund/ncomms11790-s4.csv",stringsAsFactors=FALSE,row.names=1,skip=1)

#Include HyperDiploidy as fourth category
#ltargets = data.frame(Fusion = ifelse(grepl("BCR-ABL1", lsamp[,2]) | grepl("Ph-like", lsamp[,2]), "Phlike", 
#                                   ifelse(grepl("DUX4", lsamp[,2]),"ERG", ifelse(grepl("hyperdiploidy", lsamp[,2]), "HyperDiploidy",
#                                          ifelse(grepl("ETV", lsamp[,2]),"ETV","Other")))))


#Remove hyperdiploidy from categories
ltargets = data.frame(Fusion = ifelse(grepl("BCR-ABL1", lsamp[,2]) | grepl("Ph-like", lsamp[,2]), "Phlike", 
                                   ifelse(grepl("DUX4", lsamp[,2]),"ERG", 
                                          ifelse(grepl("ETV", lsamp[,2]),"ETV","Other"))))

ltargets$Sample = paste0("Case_",row.names(lsamp))

ltargets$Original_Class = ifelse(ltargets$Fusion == "Other",lsamp[,2],as.character(ltargets$Fusion))

#Now re-order the targets to match the FPKM/y.keep
m <- match(colnames(y.keep$counts),ltargets$Sample)
m <- m[!is.na(m)]
ltargets <- ltargets[m,] #Reorder

#head(ltargets)

Now we wish to combine both datasets together only including samples from the class we wish to classify.

#Lets use dplyr to merge the data based on the shared gene symbols
all = inner_join(normalisedcounts,FPKM,by="Gene_Symbol")
row.names(all) <- all$Gene_Symbol
alls = all[,c(1:289,296:490)]

#Combine sample fusion info
all_samp = rbind(targets[,c(1,3,4)],ltargets)

Visualise

In order to visualise the two datasets together and to understand how significant the batch effects are we want to produce PCA plots and colour them variously.

#Define a new colour palette with enough colours for the nine different categories
pal = brewer.pal(9,"Set1")
#Define marker set as a vector
pchvec <- c(1:5)
batch <- c(rep(1,289),rep(2,195))

plotMDS(alls,top=1000,gene.selection="common",cex=0.8,col=pal[all_samp$Fusion],pch=pchvec[batch])
legend('center',legend=unique(levels(all_samp$Fusion)),fill=pal,cex=0.8)
legend('top',legend=c("St.Judes","Lund"),pch=pchvec)

Colour by batch

pal = brewer.pal(9,"Set1")
#Define marker set as a vector
pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))

#First 2 dimensions, colour by fusion
plotMDS(alls,top=1000,gene.selection="common",cex=0.8,col=pal[batch],pch=pchvec[batch])

Now, we clearly have some batch effects in our data…. But can we try to remove the affects by using RUVseq

design <- model.matrix(~all_samp$Fusion)
fit <- lmFit(alls,design)
efit <- eBayes(fit[,-1],trend=TRUE,robust=TRUE) #Remove the intercept [,-1]
ranked_p = rank(-efit$F.p.value)
my_genes = row.names(efit)[ranked_p<=1000]
differences <- makeGroups(all_samp$Fusion)

#ruv <- RUVg(as.matrix(alls),cIdx=mygenes,k=1,isLog=TRUE)

#Ruvs with all genes using replicates
ruv <- RUVs(as.matrix(alls),cIdx=row.names(efit),k=2,differences,isLog=TRUE)

Now lets see how big the batch effects are:

res2= ruv$normalizedCounts#Remove batch
#res2=alls #Keep in batch effects

pal = brewer.pal(9,"Set1")
#Define marker set as a vector
pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))

#First 2 dimensions, colour by fusion
plotMDS(res2,top=1000,gene.selection="common",cex=0.8,col=pal[batch],pch=pchvec[batch])
legend('bottomleft',legend=c("St.Judes","Lund"),fill=pal,pch=pchvec)

So in the above plot, we can see that we have removed hopefully alot of the batch effects without removing too much of the biology….

Looking at just the biological variation:

#Define a new colour palette with enough colours for the nine different categories
pal = brewer.pal(9,"Set1")
#Define marker set as a vector

pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))
plotMDS(res2,top=1000,gene.selection="common",cex=0.8,col=pal[all_samp$Fusion],pch=pchvec[batch])
legend('bottomright',legend=unique(levels(all_samp$Fusion)),fill=pal,pch=pchvec,cex=0.8)
legend('topleft',legend=c("St.Judes","Lund"),pch=pchvec)

Lets also have a look at the sub-classes of Other to see if there is some clustering there:

n <- 20
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
color = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

ggplotColors <- function(g){
  d <- 360/g
  h <- cumsum(c(15, rep(d,g - 1)))
  hcl(h = h, c = 100, l = 65)
}

color = ggplotColors(16)

pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))

plotMDS(res2,gene.selection="common",cex=0.8,col=color[factor(all_samp$Original_Class)],pch=pchvec[batch])
legend('bottomright',legend=unique(levels(factor(all_samp$Original_Class))),fill=color,pch=16,cex=0.5,ncol=2)
legend('topleft',legend=c('St.Judes','Lund'),pch=unique(pchvec),cex=0.5)

So we can remove the batch effects if we wish too, however in order to have a more robust classifier we can keep them in! Now lets train on all the genes!!

bnorm = res2
#######################################################
#Find the 1000 most variable genes in the entire dataset
#######################################################

#Probably a bad strategy since the highest variance is associated with batch......
#vars <-apply(bnorm,1,sd)
#sort
#vars <- sort(vars,decreasing=TRUE)

#Now just select the 5000 most variable genes, based on the names of the genes
#datExpr <- bnorm[names(vars[1:10000]),]

##########################################################
# Find genes which are the most variable in each set
# since the largest variation in each set is biology
# Use the overlap of the too
##########################################################
#High variable lund
var_lund <-apply(lcounts[6:200],1,sd)
#sort
var_lund <- sort(var_lund,decreasing=TRUE)

#Highly variable St.Judes
var_jude <-apply(ncounts,1,sd)
#sort
var_jude <- sort(var_jude,decreasing=TRUE)

comb_var = intersect(names(var_lund)[1:2000],names(var_jude)[1:2000])

#Now just select the 5000 most variable genes, based on the names of the genes
datExpr <- bnorm[comb_var,]

#Only the non "Other" samples
#cands <- all_samp[all_samp$Fusion != "Other",]

#Remove the High Hyperdiploidy samples which are possibly confounded
#cands <- all_samp[all_samp$Original_Class != "High hyperdiploidy",]



#Include the others too!
cands <- all_samp

datExpr2 <- datExpr[,cands$Sample]

Now divvy up the entire dataset into training data (80% of samples) and test data (20 % of samples) in order to construct and test our random forest classifier.

#Now subset entire dataset into training and test datasets
#sample some random rows
resp <- cands$Fusion
resp <- as.data.frame(resp)
resp$resp <- factor(resp$resp)

#Transpose datExpr to be in sample by gene
datExpr3 <- t(datExpr2)


nTest = ceiling(nrow(datExpr3) * 0.2) #Take 80% of the rows randomly from the dataset for testing
set.seed(1) #Set a seed for reproducibility
ind = sample(nrow(datExpr3),nTest,FALSE) #Sample a random set of indices comprising 20%

df.train = datExpr3[-ind,] #Take last 80% for training
df.test = datExpr3[ind,] #Using residuals
#df.test = d3[ind,] #Take first 20% for testing, using FPKM not batch normalised

classtrain = resp[-ind,]
classtest = resp[ind,]

Use cross validation to decided on what (hyper) parameters to use for mtry (the number of variables to subset on for each node) and ntree (the number of trees to use when construct the random forest). A 5 fold cross validation is used.

#Now cross-validate RF to do feature selection and find mtry,ntree e.t.c
result <- rfcv(data.frame(df.train), classtrain, cv.fold=5)
with(result, plot(n.var, error.cv, log="x", type="o", lwd=2))

This demonstrates that checking roughly 50 genes gives a decent error. Now we can tune.

#Now tuneRF to find mtry and ntree
tuneRF(data.frame(df.train),classtrain,mtryStart=50,stepFactor=0.5)
## mtry = 50  OOB error = 8.53% 
## Searching left ...
## mtry = 100   OOB error = 6.46% 
## 0.2424242 0.05 
## mtry = 200   OOB error = 7.75% 
## -0.2 0.05 
## Searching right ...
## mtry = 25    OOB error = 8.53% 
## -0.32 0.05

##         mtry   OOBError
## 25.OOB    25 0.08527132
## 50.OOB    50 0.08527132
## 100.OOB  100 0.06459948
## 200.OOB  200 0.07751938

Looks like 80 is actually a pretty good value. So lets build a random forest from all most variable genes in the datasets with the parameters we have cross validated.

#Now train the random forest
set.seed(1) #Set the seed in order to gain reproducibility
#Random forest only accepts a data.frame, not a matrix, hence conver
RF1 = randomForest(classtrain~., data=data.frame(df.train),importance=T,mtry=100,proximity=TRUE)
RF1
## 
## Call:
##  randomForest(formula = classtrain ~ ., data = data.frame(df.train),      importance = T, mtry = 100, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 100
## 
##         OOB estimate of  error rate: 5.68%
## Confusion matrix:
##        ERG ETV Other Phlike class.error
## ERG     49   1     1      1  0.05769231
## ETV      0  91     0      0  0.00000000
## Other    0   3    97      6  0.08490566
## Phlike   0   2     8    128  0.07246377

Variable Importance

Now we can extract the most important variables used in the random forest classification.

#Extract variable importance
var.imp = data.frame(importance(RF1))
#Looking into the importance of different variables
varImpPlot(RF1)

#Extract Gini Importance
var.imp = data.frame(importance(RF1))
RFImportanceGini=var.imp$MeanDecreaseGini

#Looking at top Genes
set.seed(1)
NumberImportantGenes=c(1:100)
OOBerror=rep(NA, length(NumberImportantGenes) )
for (i in c(1:length(NumberImportantGenes)) ) {
topGenes= rank(-RFImportanceGini, ties.method="first" ) <= i
#Since the variables are highly correlated, we choose a small value for mtry (mtry=1).


RF2 <- randomForest(classtrain~., data=data.frame(df.train[,topGenes]), ntree=201,
                    importance=F, mtry=1)
OOBerror[i]=1-sum(diag(table(RF2$predicted,classtrain)))/length(classtrain)
}
plot(NumberImportantGenes,OOBerror ,cex=1.5,xlab="Number of Top Most
   Important Genes", ylab="RF Error Rate",cex.lab=1.5 )

Furthermore we can see the clustering of the various groups using the proximity matrix of the random forest to see how well the groups seperate.

#View MDS
MDSplot(RF1, classtrain,palette=pal)
legend("bottomleft",legend=unique(levels(classtrain)),fill=pal,cex=0.7)

Building a random forest from the top genes

So as we can see almost all of the dataset is described by fourty-two genes, so in order to have a more general model we can simply build a random forest on the those fourty two genes:

#Get and save top gene names
mDGini = var.imp[,"MeanDecreaseGini",drop=FALSE]
topGeneNames = mDGini[order(-mDGini$MeanDecreaseGini),,drop=FALSE]
#Just take top 20
topGeneNames = topGeneNames[1:20,,drop=FALSE]
topgenes = row.names(topGeneNames)
topgenes=gsub("\\.","-",topgenes) #Weird bug which switches a dot with a dash when using RF???
#Cross validating the number of features to use
cvrf = rfcv(df.train[,topgenes],classtrain,cv.fold=5,step=0.5)
print(cvrf$error.cv)
##         20         10          5          2          1 
## 0.08010336 0.08527132 0.24289406 0.38501292 0.58397933
#Tune for mtry
tuneRF(df.train[,topgenes],classtrain,stepFactor = 0.5,ntreeTry=401,mtryStart=5)
## mtry = 5  OOB error = 5.94% 
## Searching left ...
## mtry = 10    OOB error = 6.98% 
## -0.173913 0.05 
## Searching right ...
## mtry = 2     OOB error = 6.2% 
## -0.04347826 0.05

##        mtry   OOBError
## 2.OOB     2 0.06201550
## 5.OOB     5 0.05943152
## 10.OOB   10 0.06976744
set.seed(1)
RF2 <-randomForest(classtrain~., data=data.frame(df.train[,topgenes]), importance=T, mtry=5, proximity=T)
RF2
## 
## Call:
##  randomForest(formula = classtrain ~ ., data = data.frame(df.train[,      topgenes]), importance = T, mtry = 5, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 6.46%
## Confusion matrix:
##        ERG ETV Other Phlike class.error
## ERG     50   1     1      0  0.03846154
## ETV      0  90     1      0  0.01098901
## Other    0   2    94     10  0.11320755
## Phlike   0   2     8    128  0.07246377
#Save the RF object and topgenes
save(RF2,file="RF_model.Rda")
save(topgenes,file="topGeneNames.Rda")

Now lets explore our random forest to see if it is doing what we think and want, so lets again consider the MDs plot made from the proximity matrix, but also just look at a heatmap based on a hierachical clustering of the top 20 genes used in our random forest as a cross-check:

topdata = res2[topgenes,]
yx <- as.matrix(topdata)

heatmap.2(yx,colCol=pal[all_samp$Fusion],trace = "none",col =brewer.pal(11,"Spectral"))

#Check how the MDS plot looks like with just these top genes
pal = brewer.pal(9,"Set1")
#Define marker set as a vector
pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))

plotMDS(res2[topgenes,],gene.selection="common",cex=0.8,col=pal[all_samp$Fusion],pch=pchvec[batch])
legend('bottomright',legend=unique(levels(all_samp$Fusion)),fill=pal,pch=16)
legend('bottomleft',legend=c('St.Judes','Lund'),pch=unique(pchvec),cex=0.5)

Subclasses of other

But the question is, do the subclasses of other classify?

#Check how the MDS plot looks like with just these top genes
#color = grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)]

color = ggplotColors(16)

pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))

plotMDS(res2[topgenes,],gene.selection="common",cex=0.8,col=color[factor(all_samp$Original_Class)],pch=pchvec[batch])
legend('bottomleft',legend=unique(levels(factor(all_samp$Original_Class))),fill=color,pch=16,cex=0.5,ncol=2)
legend('bottomright',legend=c('St.Judes','Lund'),pch=unique(pchvec),cex=0.5)

Check for batch effects in top genes

Nowe we also just want to check whether the topgenes are biased to batches

#Check how the MDS plot looks like with just these top genes
pal = brewer.pal(9,"Set1")

pchvec <- c(1:2)
batch <- c(rep(1,289),rep(2,195))

plotMDS(res2[topgenes,],gene.selection="common",cex=0.8,col=pal[batch],pch=pchvec[batch])
legend('topright',legend=c("St.Judes","Lund"),fill=pal,pch=pchvec)

#Time to Test Now we wish to understand how well the randomForest does on predicting on the test dataset.

RF2pred <- predict(RF2,data.frame(df.test))
RF2prob <-predict(RF2,data.frame(df.test),type="prob")

So as you can see the randomForest is doing a reasonable job and does what we think! Lets visualise in a confusion matrix:

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
#First make matrix
conf = RF2$confusion[,c(1:4)]

#Now lets normalise
#conf.norm = apply(conf,2, function(x) x/sum(x))

#Rejig matrix to table then to dataframe
confusion = as.data.frame(as.table(conf))
values = c(0.05,1.0)
gg <- ggplot(confusion,aes(x=Var1,y=Var2,fill=Freq))
gg + geom_tile() + labs(fill="Frequency",x="Truth",y="Predicted") + geom_text(aes(label=Freq),show.legend = FALSE,colour="white")

out = data.frame(RF2prob)
out = cbind(out,RF2pred)
out$True = classtest
Maj = apply(RF2prob[,1:4],1,max)
out$Maj = Maj 
head(out)
##                                     ERG   ETV Other Phlike RF2pred   True
## iAMP21.SJBALL021900_D1.TB.95.0559 0.078 0.004 0.616  0.302   Other  Other
## Phlike.SJBALL021076_D1.PASZCR     0.000 0.000 0.056  0.944  Phlike Phlike
## PHALL.SJPHALL020037_D1.PANLSC     0.160 0.004 0.050  0.786  Phlike Phlike
## Case_148                          0.004 0.002 0.920  0.074   Other  Other
## ETV.SJETV089_D.TB.06.1784         0.008 0.900 0.054  0.038     ETV    ETV
## Case_142                          0.006 0.854 0.014  0.126     ETV    ETV
##                                     Maj
## iAMP21.SJBALL021900_D1.TB.95.0559 0.616
## Phlike.SJBALL021076_D1.PASZCR     0.944
## PHALL.SJPHALL020037_D1.PANLSC     0.786
## Case_148                          0.920
## ETV.SJETV089_D.TB.06.1784         0.900
## Case_142                          0.854
#First make matrix
conf = table(out[,c(5,6)])
print(conf)
##         True
## RF2pred  ERG ETV Other Phlike
##   ERG     11   0     0      0
##   ETV      0  17     0      0
##   Other    0   0    28      3
##   Phlike   0   0     2     36
#Rejig matrix to table then to dataframe
confusion = as.data.frame(as.table(conf))
gg <- ggplot(confusion,aes(x=RF2pred,y=True,fill=Freq))  
gg + geom_tile() + labs(fill="Frequency") + geom_text(aes(label=Freq),show.legend = FALSE,colour="white")

Choosing a Threshold

So now the question we want to address is, what happens when we have a sample whom doesn’t fit into one of the above four samples. Well, in that case we wish the sample to be called as “Unclassified”. So at what probability threshold should we consider something as unclassified rather than one of the other classes?

#Lets take the test set and the "Other" samples and see what goes down
cands <- all_samp[all_samp$Fusion == "Other",]
autre <- datExpr[topgenes,cands$Sample]

#If we excluded the Others add them in 
#combo = rbind(df.test[,topgenes],t(autre)) 

#Otherwise
combo = df.test[,topgenes]


RF2pred_autre <- predict(RF2,data.frame(combo))
RF2prob_autre <-predict(RF2,data.frame(combo),type="prob")

autre_out = data.frame(RF2prob_autre)
autre_out = cbind(autre_out,RF2pred_autre)

#If excluded Others add them in
#autre_out$True = c(as.character(classtest),as.character(cands$Fusion))
#otherwise
autre_out$True = c(as.character(classtest))

Maj = apply(RF2prob_autre[,1:4],1,max)
autre_out$Prob = Maj 

How about class specific thresholds?

#Find MLL samples as litmus tests
mlls= row.names(lsamp[lsamp$Genetic.subtype.after.RNA.seq == "MLL",])

gg <- ggplot(autre_out)

#ETV
gg + geom_density(aes(ETV),col="red") + 
geom_vline(xintercept=autre_out["Case_22","ETV"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_58","ETV"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_170","ETV"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_152","ETV"],col="purple",linetype=2) +
geom_vline(xintercept=autre_out[autre_out$True=="ETV","ETV"],col="yellow")    
## Warning: Removed 1 rows containing missing values (geom_vline).

## Warning: Removed 1 rows containing missing values (geom_vline).

#ERG
gg + geom_density(aes(ERG),col="blue") + geom_vline(xintercept=autre_out["Case_10","ERG"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_170","ERG"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_163","ERG"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_152","ERG"],col="purple",linetype=2) + geom_vline(xintercept=autre_out[autre_out$True=="ERG","ERG"],col="yellow")  
## Warning: Removed 1 rows containing missing values (geom_vline).

## Warning: Removed 1 rows containing missing values (geom_vline).

## Warning: Removed 1 rows containing missing values (geom_vline).

#Phlike
gg + geom_density(aes(Phlike),col="green") + geom_vline(xintercept=autre_out["Case_10","Phlike"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_170","Phlike"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_163","Phlike"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Case_152","Phlike"],col="purple",linetype=2) + 
geom_vline(xintercept=autre_out[autre_out$True=="Phlike","Phlike"],col="yellow")    
## Warning: Removed 1 rows containing missing values (geom_vline).

## Warning: Removed 1 rows containing missing values (geom_vline).

## Warning: Removed 1 rows containing missing values (geom_vline).

#Hyperdiploidy
#hyper= row.names(lsamp[lsamp$Genetic.subtype.after.RNA.seq == "HyperDiplody",])
#hyp = gg + geom_density(aes(HyperDiploidy),col="black") + geom_vline(xintercept=autre_out["Sample_10","HyperDiploidy"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Sample_170","HyperDiploidy"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Sample_163","HyperDiploidy"],col="purple",linetype=2) + geom_vline(xintercept=autre_out["Sample_152","HyperDiploidy"],col="purple",linetype=2) +
#geom_vline(xintercept=autre_out["Sample_23","HyperDiploidy"],col="yellow",linetype=1) +
#geom_vline(xintercept=autre_out[autre_out$True=="HyperDiploidy","HyperDiploidy"],col="yellow")
#hyp

#Other
hyper= row.names(lsamp[lsamp$Genetic.subtype.after.RNA.seq == "Other",])
hyp = gg + geom_density(aes(Other),col="black") +  
geom_vline(xintercept=autre_out[autre_out$True=="Other","Other"],col="yellow") + geom_vline(xintercept=autre_out["Case_163","Other"],col="purple",linetype=2)
hyp

The Threshold Landscape

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
prob_plot = function(class_name,df){
    dat = df[df$True==class_name,]
    dat = dat[order(-dat[,class_name]),]
    dat$Sample = row.names(dat)
    dat= dat %>% gather(class,prob,ERG:Phlike)
    
    my.order = unique(dat$Sample)
    dat$Sample <- factor(dat$Sample, levels = my.order)
    
    #classes = unique(dat$class)
    #dat$class <- factor(dat$class,levels=c(class_name,classes[classes!=class_name]))
    
    lab_cols = ifelse(dat$batch == "Lund","#f9b52c","#2cabf9")
    
    
    gg <- ggplot(dat)
    gg + geom_bar(aes(x=Sample,y=prob,fill=class),stat='identity') + theme_bw() +  theme(axis.text.x=element_blank(),axis.ticks.x=element_line(color=lab_cols,size=5),axis.title.y=element_text(size=18),axis.title.x=element_text(size=18), title=element_text(size=18), legend.position="none") + labs(y="Probability", title=paste0("Truth Class ",class_name)) 
}

prob_plot = function(class_name,df){
    dat = df[df$True==class_name,]
    dat = dat[order(-dat[,class_name]),]
    dat$Sample = row.names(dat)
    dat= dat %>% gather(class,prob,ERG:Phlike)
    
    my.order = unique(dat$Sample)
    dat$Sample <- factor(dat$Sample, levels = my.order)
    
    #classes = unique(dat$class)
    #dat$class <- factor(dat$class,levels=c(class_name,classes[classes!=class_name]))
    
    lab_cols = ifelse(dat$batch == "Lund","#f9b52c","#2cabf9")
    
    
    gg <- ggplot(dat)
    gg + geom_bar(aes(x=Sample,y=prob,fill=class),stat='identity') + theme_bw() +  theme(axis.text.x=element_blank(),axis.ticks.x=element_line(color=lab_cols,size=5),axis.title.y=element_text(size=18),axis.title.x=element_text(size=18), title=element_text(size=18), legend.position="none") + labs(y="Probability", title=paste0("Truth Class ",class_name)) 
}


#Add a variable to discriminate whether batch is from Lund or St.Judes
autre_out$batch = ifelse(grepl("Case",row.names(autre_out)),"Lund","St.Judes")

prob_plot('Phlike',autre_out)

prob_plot('ETV',autre_out)

prob_plot('ERG',autre_out)

prob_plot('Other',autre_out)